Let's first import the required libraries:

In [1]:

Load Data From CSV File

In [4]:
Out[4]:
PID Fraud DeductibleAmt AmtReimbursed CountOfCID AvgAmtReimb AvgAmtDed CountOfBene AvgReimbPerBene AvgDedPerBene AvgAge
0 PRV51001 No 5340 104640 25 4186 214 24 4360 223 79
1 PRV51003 Yes 66286 605670 132 4588 502 117 5177 567 70
2 PRV51004 No 310 52170 149 350 2 138 378 2 72
3 PRV51005 Yes 3700 280910 1165 241 3 495 567 7 70
4 PRV51007 No 3264 33710 72 468 45 58 581 56 69
In [5]:
Out[5]:
PID                object
Fraud              object
DeductibleAmt       int64
AmtReimbursed       int64
CountOfCID          int64
AvgAmtReimb         int64
AvgAmtDed           int64
CountOfBene         int64
AvgReimbPerBene     int64
AvgDedPerBene       int64
AvgAge              int64
dtype: object
In [6]:
Out[6]:
Index(['PID', 'Fraud', 'DeductibleAmt', 'AmtReimbursed', 'CountOfCID',
       'AvgAmtReimb', 'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene',
       'AvgDedPerBene', 'AvgAge'],
      dtype='object')
In [7]:
Out[7]:
Fraud count percent cumulative_count cumulative_percent
0 No 4904 90.64695 4904 90.64695
1 Yes 506 9.35305 5410 100.00000
In [8]:
<ipython-input-8-e6662f8ed8ab>:1: MatplotlibDeprecationWarning: 
The matplotlib.backends.qt_editor.formsubplottool module was deprecated in Matplotlib 3.3 and will be removed two minor releases later. Use matplotlib.backends.backend_qt5.SubplotToolQt instead.
  import matplotlib.backends.qt_editor.formsubplottool
Out[8]:
<AxesSubplot:xlabel='Fraud'>
In [9]:
Out[9]:
Text(0.5, 0, 'Fraud')
In [10]:
Out[10]:
Text(0.5, 0, 'Fraud')

Data pre-processing and selection

In [11]:
In [12]:
In [13]:
In [14]:
In [15]:
In [16]:
In [17]:
Out[17]:
Fraud DeductibleAmt AmtReimbursed CountOfCID AvgAmtReimb AvgAmtDed CountOfBene AvgReimbPerBene AvgDedPerBene AvgAge
count 5410.000000 5410.000000 5.410000e+03 5410.000000 5410.000000 5410.000000 5410.000000 5410.000000 5410.000000 5410.000000
mean 0.093530 8078.560444 1.028730e+05 103.181331 1740.695379 153.188355 67.153420 1972.164880 168.214048 73.753420
std 0.291201 24924.637743 2.687367e+05 272.456989 3484.469231 300.936682 142.748729 3709.697633 318.157335 4.725778
min 0.000000 0.000000 0.000000e+00 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 34.000000
25% 0.000000 10.000000 4.460000e+03 10.000000 232.250000 0.000000 8.000000 287.000000 0.000000 72.000000
50% 0.000000 310.000000 1.980500e+04 31.000000 356.000000 4.000000 25.000000 543.500000 7.000000 74.000000
75% 0.000000 5340.000000 8.556500e+04 87.000000 1490.000000 137.000000 65.000000 1902.500000 172.000000 76.000000
max 1.000000 539426.000000 5.996050e+06 8240.000000 57000.000000 1068.000000 2857.000000 75000.000000 2136.000000 101.000000
In [18]:
Out[18]:
Index(['PID', 'Fraud', 'DeductibleAmt', 'AmtReimbursed', 'CountOfCID',
       'AvgAmtReimb', 'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene',
       'AvgDedPerBene', 'AvgAge'],
      dtype='object')
In [20]:
/home/aar/anaconda3/lib/python3.8/site-packages/seaborn/axisgrid.py:1912: UserWarning:

The `size` parameter has been renamed to `height`; please update your code.

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-20-ad27b555eda5> in <module>
----> 1 sns.pairplot(df[['Fraud', 'DeductibleAmt', 'AmtReimbursed', 'CountOfCID',
      2        'AvgAmtReimb', 'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene',
      3        'AvgDedPerBene', 'AvgAge']],
      4              hue= 'Fraud' ,diag_kind= 'kde', plot_kws = {'alpha': 0.6, 's':80, 'edgecolor': 'k'}, size=4);

~/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
     44             )
     45         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46         return f(**kwargs)
     47     return inner_f
     48 

~/anaconda3/lib/python3.8/site-packages/seaborn/axisgrid.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)
   1923     # Set up the PairGrid
   1924     grid_kws.setdefault("diag_sharey", diag_kind == "hist")
-> 1925     grid = PairGrid(data, vars=vars, x_vars=x_vars, y_vars=y_vars, hue=hue,
   1926                     hue_order=hue_order, palette=palette, corner=corner,
   1927                     height=height, aspect=aspect, dropna=dropna, **grid_kws)

~/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
     44             )
     45         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46         return f(**kwargs)
     47     return inner_f
     48 

~/anaconda3/lib/python3.8/site-packages/seaborn/axisgrid.py in __init__(self, data, hue, hue_order, palette, hue_kws, vars, x_vars, y_vars, corner, diag_sharey, height, aspect, layout_pad, despine, dropna, size)
   1233         if despine:
   1234             utils.despine(fig=fig)
-> 1235         self.tight_layout(pad=layout_pad)
   1236 
   1237     def map(self, func, **kwargs):

~/anaconda3/lib/python3.8/site-packages/seaborn/axisgrid.py in tight_layout(self, *args, **kwargs)
     62         if self._tight_layout_pad is not None:
     63             kwargs.setdefault("pad", self._tight_layout_pad)
---> 64         self.fig.tight_layout(*args, **kwargs)
     65 
     66     def add_legend(self, legend_data=None, title=None, label_order=None,

~/anaconda3/lib/python3.8/site-packages/matplotlib/cbook/deprecation.py in wrapper(*inner_args, **inner_kwargs)
    409                          else deprecation_addendum,
    410                 **kwargs)
--> 411         return func(*inner_args, **inner_kwargs)
    412 
    413     return wrapper

~/anaconda3/lib/python3.8/site-packages/matplotlib/figure.py in tight_layout(self, renderer, pad, h_pad, w_pad, rect)
   2611                else suppress())
   2612         with ctx:
-> 2613             kwargs = get_tight_layout_figure(
   2614                 self, self.axes, subplotspec_list, renderer,
   2615                 pad=pad, h_pad=h_pad, w_pad=w_pad, rect=rect)

~/anaconda3/lib/python3.8/site-packages/matplotlib/tight_layout.py in get_tight_layout_figure(fig, axes_list, subplotspec_list, renderer, pad, h_pad, w_pad, rect)
    328             top -= (1 - kwargs["top"])
    329 
--> 330         kwargs = auto_adjust_subplotpars(fig, renderer,
    331                                          nrows_ncols=(max_nrows, max_ncols),
    332                                          num1num2_list=num1num2_list,

~/anaconda3/lib/python3.8/site-packages/matplotlib/tight_layout.py in auto_adjust_subplotpars(fig, renderer, nrows_ncols, num1num2_list, subplot_list, ax_bbox_list, pad, h_pad, w_pad, rect)
     82             if ax.get_visible():
     83                 try:
---> 84                     bb += [ax.get_tightbbox(renderer, for_layout_only=True)]
     85                 except TypeError:
     86                     bb += [ax.get_tightbbox(renderer)]

~/anaconda3/lib/python3.8/site-packages/matplotlib/axes/_base.py in get_tightbbox(self, renderer, call_axes_locator, bbox_extra_artists, for_layout_only)
   4154             if self.xaxis.get_visible():
   4155                 try:
-> 4156                     bb_xaxis = self.xaxis.get_tightbbox(
   4157                         renderer, for_layout_only=for_layout_only)
   4158                 except TypeError:

~/anaconda3/lib/python3.8/site-packages/matplotlib/axis.py in get_tightbbox(self, renderer, for_layout_only)
   1107             return
   1108 
-> 1109         ticks_to_draw = self._update_ticks()
   1110 
   1111         self._update_label_position(renderer)

~/anaconda3/lib/python3.8/site-packages/matplotlib/axis.py in _update_ticks(self)
   1020         """
   1021         major_locs = self.get_majorticklocs()
-> 1022         major_labels = self.major.formatter.format_ticks(major_locs)
   1023         major_ticks = self.get_major_ticks(len(major_locs))
   1024         self.major.formatter.set_locs(major_locs)

~/anaconda3/lib/python3.8/site-packages/matplotlib/ticker.py in format_ticks(self, values)
    247     def format_ticks(self, values):
    248         """Return the tick labels for all the ticks at once."""
--> 249         self.set_locs(values)
    250         return [self(value, i) for i, value in enumerate(values)]
    251 

~/anaconda3/lib/python3.8/site-packages/matplotlib/ticker.py in set_locs(self, locs)
    780                 self._compute_offset()
    781             self._set_order_of_magnitude()
--> 782             self._set_format()
    783 
    784     def _compute_offset(self):

~/anaconda3/lib/python3.8/site-packages/matplotlib/ticker.py in _set_format(self)
    882         thresh = 1e-3 * 10 ** loc_range_oom
    883         while sigfigs >= 0:
--> 884             if np.abs(locs - np.round(locs, decimals=sigfigs)).max() < thresh:
    885                 sigfigs -= 1
    886             else:

<__array_function__ internals> in round_(*args, **kwargs)

~/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py in round_(a, decimals, out)
   3635     around : equivalent function; see for details.
   3636     """
-> 3637     return around(a, decimals=decimals, out=out)
   3638 
   3639 

<__array_function__ internals> in around(*args, **kwargs)

KeyboardInterrupt: 

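Options for handling missing values: `df.fillna(0)` replaces each missing value with zero, while `df.dropna(0)` (i.e. `axis=0`) drops every row that contains a missing value.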

`df.fillna(method='pad')` fills each missing value with the previous valid value (forward fill, equivalent to `df.ffill()`).

`df.fillna(method='bfill')` fills each missing value with the next valid value (backward fill, equivalent to `df.bfill()`).

In [21]:
Out[21]:
Fraud DeductibleAmt AmtReimbursed CountOfCID AvgAmtReimb AvgAmtDed CountOfBene AvgReimbPerBene AvgDedPerBene AvgAge
Fraud 1.000000 0.532070 0.575558 0.374197 0.193802 0.205323 0.393531 0.223881 0.239752 0.001716
DeductibleAmt 0.532070 1.000000 0.961801 0.319308 0.237398 0.295842 0.417258 0.264313 0.331850 0.012651
AmtReimbursed 0.575558 0.961801 1.000000 0.542463 0.205229 0.233973 0.613981 0.236124 0.269787 0.011656
CountOfCID 0.374197 0.319308 0.542463 1.000000 -0.080841 -0.094245 0.942310 -0.056805 -0.078149 0.002431
AvgAmtReimb 0.193802 0.237398 0.205229 -0.080841 1.000000 0.841271 -0.074621 0.984850 0.837690 0.030669
AvgAmtDed 0.205323 0.295842 0.233973 -0.094245 0.841271 1.000000 -0.085396 0.823499 0.992150 0.034160
CountOfBene 0.393531 0.417258 0.613981 0.942310 -0.074621 -0.085396 1.000000 -0.055838 -0.070863 0.006823
AvgReimbPerBene 0.223881 0.264313 0.236124 -0.056805 0.984850 0.823499 -0.055838 1.000000 0.837226 0.029810
AvgDedPerBene 0.239752 0.331850 0.269787 -0.078149 0.837690 0.992150 -0.070863 0.837226 1.000000 0.033845
AvgAge 0.001716 0.012651 0.011656 0.002431 0.030669 0.034160 0.006823 0.029810 0.033845 1.000000

Let's define X and y for our dataset:

In [22]:
(5410, 8) (5410,)
In [23]:
In [24]:
Train set: (4869, 8) (4869,)
Test set: (541, 8) (541,)

Modeling (Scikit-learn)

Based on the count in each cell of the confusion matrix, we can calculate the precision and recall of each label:

  • Precision is a measure of the accuracy given that a class label has been predicted. It is defined by: Precision = TP / (TP + FP), where TP is the number of true positives and FP the number of false positives.

  • Recall is the true positive rate. It is defined as: Recall = TP / (TP + FN), where TP is the number of true positives and FN the number of false negatives.

So, we can calculate precision and recall of each class.

F1 score: Now we are in a position to calculate the F1 score for each label, based on the precision and recall of that label.

The F1 score is the harmonic mean of precision and recall, F1 = 2 · (Precision · Recall) / (Precision + Recall). It reaches its best value at 1 (perfect precision and recall) and its worst at 0. It is a good way to show that a classifier has a good value for both recall and precision.

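And finally, we can summarize a classifier by the unweighted average of the F1 scores of both labels (the macro-averaged F1 score, which is not the same as accuracy). It is reported in the `macro avg` row of each classification report below; for logistic regression, for example, it is 0.74.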

LOGISTIC REGRESSION

In [25]:
Confusion matrix
[[437  57]
 [  7  40]]
Normalized confusion matrix
[[0.88 0.12]
 [0.15 0.85]]
In [29]:
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       494
           1       0.41      0.85      0.56        47

    accuracy                           0.88       541
   macro avg       0.70      0.87      0.74       541
weighted avg       0.93      0.88      0.90       541

In [30]:
Out[30]:
array([0.85, 0.85, 0.85, 0.85, 0.85])
In [31]:
Out[31]:
0.8817005545286506

SVM

In [35]:
Confusion matrix
[[447  47]
 [  6  41]]
Normalized confusion matrix
[[0.9  0.1 ]
 [0.13 0.87]]
In [33]:
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       494
           1       0.47      0.87      0.61        47

    accuracy                           0.90       541
   macro avg       0.73      0.89      0.78       541
weighted avg       0.94      0.90      0.91       541

Random Forest Classifier

In [29]:
In [30]:
              precision    recall  f1-score   support

           0       0.99      0.87      0.93       494
           1       0.41      0.91      0.57        47

    accuracy                           0.88       541
   macro avg       0.70      0.89      0.75       541
weighted avg       0.94      0.88      0.90       541

In [31]:

XGBoost

In [32]:
In [33]:
Start Feeding Data
Start Training
Out[33]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=21, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=27, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=27, subsample=0.6, tree_method='exact',
              validate_parameters=1, verbosity=None)
In [34]:
Confusion matrix
[[459  35]
 [ 10  37]]
Normalized confusion matrix
[[459  35]
 [ 10  37]]
In [36]:
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-36-15c028a918c9> in <module>
      1 LABELS=['Fraud','not_Fraud']
----> 2 conf_matrix = confusion_matrix(y_test, yhat_xgb)
      3 plt.figure(figsize=(6, 6))
      4 sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d");
      5 plt.title("Confusion matrix")

NameError: name 'yhat_xgb' is not defined

In [37]:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95       494
           1       0.51      0.79      0.62        47

    accuracy                           0.92       541
   macro avg       0.75      0.86      0.79       541
weighted avg       0.94      0.92      0.92       541

In [39]:
Out[39]:
<AxesSubplot:>
In [40]:
Out[40]:
array([1, 6, 3, 5, 7, 2, 4, 0])
In [41]:
In [42]:
AmtReimbursed 0.6838422
AvgReimbPerBene 0.056488708
AvgAmtReimb 0.04833007
CountOfBene 0.04700127
AvgDedPerBene 0.04626358
CountOfCID 0.042688664
AvgAmtDed 0.038836647
DeductibleAmt 0.036548927
In [43]:

Summary

In [47]:
LR Accuracy             :  0.88
SVM Accuracy            :  0.9
Random Forest  Accuracy :  0.88
XGB            Accuracy :  0.92
In [ ]: